knitr::opts_chunk$set(echo = TRUE)
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, ggplot2, stringr, tidyr, data.table, DT, ggjoy, ggmap, plotly, leaflet)

In this notebook, I demonstrate some exploratory work to analyse flight delays at take-off. We can script the public data from the US DoT website (at the following url: http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ ID=236&DB_Short_Name=OnTime) to answer questions such as which airport has the most delays, which one is the most on-time airline etc.

Exploratory Data Analysis

# read in data
flight <- fread("376941973_T_ONTIME.csv", stringsAsFactors = F)
airport_lookup <- fread("AIRPORT_ID.csv", stringsAsFactors = F)
airport_lookup <- airport_lookup[, Code:=as.numeric(Code)] # convert to numeric
airline_lookup <- fread("AIRLINE_Lookup.csv", stringsAsFactors = F)
airline_lookup <- airline_lookup[, Code:=as.numeric(Code)] # convert to numeric

A flight delay is defined as the following:

  • flight departed at the gate 15 minutes or more after the scheduled departure time

  • flight arrived at the gate 15 minutes or more after the scheduled arrival time

Most at fault airlines

  1. Find the airlines most at fault in the origin airports that exceed the 90th percentile of overall delays in 2016 for the month of January.
# get the departure data for each airport
depart.df <- flight %>% 
                  # filter out the NAs in DEP_DELAY_NEW variable
                  filter(!is.na(DEP_DEL15)) %>% 
                  group_by(ORIGIN_AIRPORT_ID) %>% 
                  summarise(
                          # calculate the total number of departure airlines 
                          NumOfDEP = n(),
                          # calculate the total number of departure delays
                          NumOfDelayDEP   = sum(DEP_DEL15),
                          MedianDelayDEP  = median(DEP_DELAY),
                          AvgDelayDEP    = round(mean(DEP_DELAY),2),
                          OnTimeRateDEP    = round(100 - 100*mean(DEP_DEL15),1)
                          ) %>% 
                      arrange(-NumOfDelayDEP) %>% ungroup() %>% 
                      filter(NumOfDEP >=5) %>% 
                  # left join the airport lookup table to get the actual name
                    left_join(airport_lookup, by = c("ORIGIN_AIRPORT_ID" = "Code")) %>% 
                  # separate the Description to two columns
                    separate( Description, c("City", "AirportName"), ":")


# get the arrival data for each airport
arrival.df <- flight %>% 
                  # filter out the NAs in DEP_DELAY_NEW variable
                  filter(!is.na(ARR_DEL15)) %>% 
                  group_by(ORIGIN_AIRPORT_ID) %>% 
                  summarise(
                          # calculate the total number of arrival airlines for each airport
                          NumOfARR = n(),
                          # calculate the total number of arrival delays for each airport
                          NumOfDelayARR   = sum(ARR_DEL15),
                          MedianDelayARR  = median(ARR_DELAY),
                          AvgDelayARR     = round(mean(ARR_DELAY),2),
                          OnTimeRateARR    = round(100 - 100*mean(ARR_DEL15),1) 
                          ) %>% 
                      arrange(-NumOfDelayARR) %>% ungroup() %>% 
                      filter(NumOfARR >=5) 

# left join airport lookup table to get the actual name
delay.df <-  left_join(depart.df, arrival.df, by = "ORIGIN_AIRPORT_ID") %>% 
                mutate( NumOfAirlines = NumOfDEP + NumOfARR,
                        NumOfDelay    = NumOfDelayDEP + NumOfDelayARR,
                        # calculate the percentage of airlines for each airport
                        PerOfAirlines = round(100*NumOfAirlines / sum(NumOfAirlines),2),
                        # calculate the percentage of delays for each airport
                        PerOfDelay = round(100*NumOfDelay / sum(NumOfDelay),2)
                        )

# calculate the 90th percentile of all eligible airports
percentile <- (round(nrow(delay.df) * 0.1,0))
exceed90th <- delay.df %>% arrange(-NumOfDelay) %>% head(.,percentile) %>% 
              select(AirportName, City, NumOfAirlines, NumOfDelay)
datatable(exceed90th, class = 'cell-border stripe',
          caption = 'Table 1: A list of airlines most at fault in the origin airports that exceed the 90th percentile of overall delays in January, 2016')

Overall delays for origin airport

  1. Compute the percentage that those airlines most at fault contribute to the overall delays for each origin airport.
delayPerTable <- delay.df %>% arrange(-PerOfDelay)  %>% 
  select(AirportName,City,PerOfDelay,NumOfDelay,NumOfAirlines, ORIGIN_AIRPORT_ID) 

datatable(delayPerTable, class = 'cell-border stripe',
          caption = 'Table 2: A list of the percentage that those airlines most at fault contribute to the overall delays for each origin airport in January, 2016')

Visualizations

  1. Create some visualizations to demonstrate my findings.

For example, we would like to investigate is there any interesting pattern in John F. Kennedy International Airport in New York? Below is a plot of distribution of number of minutes delayed at JFK in January, which shows the variability of minutes delayed for each day. It seems that if you were unlucky to travel on Jan 24th in New York, you would experience an unusual delay for departure.

# top10Delay <- delay.df %>% arrange(-PerOfDelay) %>% 
#   head(.,10) %>% select(ORIGIN_AIRPORT_ID) %>% as.data.frame() %>% t() %>% as.vector() 
# top10.df <- flight %>% filter(ORIGIN_AIRPORT_ID %in% top10Delay) 

ggplot(flight %>% filter(DEP_DELAY < 50,
                           DEP_DELAY > -30,
                         # only keep JFK
                         ORIGIN_AIRPORT_ID == 12478), 
       aes(x = DEP_DELAY, y = factor(FL_DATE))) + geom_joy() + 
  ylab("Date") + xlab("Number of minutes delayed for departure at JFK airport") +
  ggtitle("Distribution of number of minutes delayed at JFK in January")
## Picking joint bandwidth of 2.31

A plot of Percentage of on-time departure vs Percentage of on-time arrival is also shown below. A bigger (busier) airport is expected to have more delays. It’s worth to notice that, for example, Denver International Airport and Chicago O’Hare International Airport have a comparable size of about 38000 flights in January. However, the on-time rate of Denver (85%) is 7% higher than Chicago (77%).

# library(plotly)
plot_ly(
  delay.df, x = ~ OnTimeRateDEP, y = ~ OnTimeRateARR,
  # Hover text:
  text = ~paste(AirportName, ":", NumOfDelay, "delays out of ",NumOfAirlines, "flights in Jan"),
  color = ~ NumOfDelay, size = ~ NumOfDelay
        )  %>% layout(xaxis = list(title = "Percentage of on-time departure (%)"), 
                      yaxis = list( title = "Percentage of on-time arrival (%)") 
                      )

Mapping overall delays for origin airport

  1. For each origin airport, we can retrieve the latitudinal and longitudinal coordinates via the Google Geocoding API (https://developers.google.com/maps/documentation/geocoding/intro). ggmap is one of the nice libraries that can extract latitudinal and longitudinal coordinates for a given location in R.
# run this instead to read in geocoded data
geo <- read.csv("geocoded.csv",stringsAsFactors = F)
delay.df$lat <- geo$lat
delay.df$lon <- geo$lon

# not run
# library(ggmap)
# delay.df$address <- paste(delay.df$AirportName,delay.df$City, sep = ",")
# for(i in 1:nrow(delay.df))
# {
#   result <- geocode(delay.df$address[i], output = "latlona", source = "google")
#   delay.df$lon[i] <- as.numeric(result[1])
#   delay.df$lat[i] <- as.numeric(result[2])
#   delay.df$geoAddress[i] <- as.character(result[3])
# }

# one manual fix error
# delay.df$lon[273] <- 13.484728
# delay.df$lat[273] <- 144.7993212
# write.csv(delay.df, "geocoded.csv", row.names=FALSE)

Once I have collected the geo-data, I further map the delays of each origin airport to a geographical map where points are colour coded according to the magnitude of overall delays for that origin airport.

# maps numeric input data to a fixed number of output colors using quantiles
# it seems look better than the continuous scale due to the huge difference between
# the max and min value in the data
qpal <- colorQuantile("RdYlBu", delay.df$NumOfDelay, n = 5, reverse=T)

# built an interactive map using leaflet
leaflet(data = delay.df) %>%
    # Base groups
  addTiles(group = "OSM (default)") %>%
  addProviderTiles(providers$Stamen.TonerLite, group = "Toner Lite") %>% 
  addCircles(
    lng = ~lon, lat = ~lat, 
    # represent NumOfDelay using color and radius
    radius = ~NumOfDelay*15, 
    color = ~qpal(NumOfDelay),
    # implement the click function
    popup = ~as.character(paste("On-Time % of DEP and ARR:",OnTimeRateDEP," & ",OnTimeRateARR)),
    # implement the hover function
    label=~as.character(paste(AirportName,"'s delay: ", NumOfDelay, " out of ",NumOfAirlines," flights",sep="")),
     labelOptions = labelOptions(noHide = F,direction = 'auto')
  ) %>%
  addLegend("bottomright",pal = qpal, values = ~NumOfDelay,
    title = "Quantile of overall delays",
    # labFormat = labelFormat(prefix = ""),
    opacity = 1
            ) %>% 
    # Layers control
  addLayersControl(
    baseGroups = c("OSM (default)", "Toner Lite"),
    options = layersControlOptions(collapsed = FALSE)
  ) %>% 
  setView(lng = -96.0589, lat = 32.3601, zoom = 4)